{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 使用SQL服务器加载数据库"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "为加快加载速度,需要在mySQL中设置`local_infile`为true.以管理员权限在mySQL中运行以下命令:\n",
    "\n",
    "```sql\n",
    "SET GLOBAL local_infile = true;\n",
    "```\n",
    "\n",
    "更新以下配置参数,注意不要修改 `allow_local_infile`设置."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from cpiextract.sql_connection import connect_to_mysql\n",
    "\n",
    "sql_config = {\n",
    "    \"host\": \"XXX.XX.XX.XXX\",\n",
    "    \"user\": \"user\",\n",
    "    \"password\": \"password\",\n",
    "    \"database\": \"cpie\",\n",
    "    \"allow_local_infile\": True\n",
    "}\n",
    "\n",
    "cnx = connect_to_mysql(sql_config, attempts=3)\n",
    "\n",
    "assert cnx and cnx.is_connected()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "载入dbs_config文件,注意修改数据文件的路径:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_path = '/home/regen/projects/cpie_dbs' # 自行修改,含有数据库和dbs_cinfig.json的路径"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "\n",
    "json_file_path = os.path.join(data_path, 'dbs_config.json')\n",
    "\n",
    "# Load the JSON data\n",
    "with open(json_file_path, 'r') as f:\n",
    "    dbs_config = json.load(f)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 创建SQL表格并载入数据库"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_table(cursor, table_name, headers, header_types):\n",
    "    # Define column types based on your data\n",
    "    column_types = ['INT AUTO_INCREMENT PRIMARY KEY'] + [header_types[h] for h in headers]\n",
    "    columns = ', '.join([f\"`{header}` {col_type}\" for header, col_type in zip(['id'] + headers, column_types)])\n",
    "    # column_types = [header_types[h] for h in headers]\n",
    "    # columns = ', '.join([f\"{header} {col_type}\" for header, col_type in zip(headers, column_types)])\n",
    "    query = f\"DROP TABLE IF EXISTS {table_name}\"\n",
    "    cursor.execute(query)\n",
    "\n",
    "    query = f\"CREATE TABLE IF NOT EXISTS {table_name} ({columns})\"\n",
    "    # Execute the CREATE TABLE query\n",
    "    cursor.execute(query)\n",
    "    print('Table created')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_csv_to_table(cnx, config):\n",
    "    cursor = cnx.cursor()\n",
    "\n",
    "    file_path = 'data/'+ str(config[\"file\"])\n",
    "    table_name = config[\"name\"]\n",
    "    sep = ',' if config[\"file\"].endswith(\".csv\") else '\\t'\n",
    "\n",
    "    # Open the TSV file and create the table if it doesn't exist\n",
    "    with open(file_path, 'r') as file:\n",
    "        headers = file.readline().strip().split(sep)\n",
    "        # Create the table\n",
    "        create_table(cursor, table_name, headers, config[\"columns\"])  \n",
    "\n",
    "        # Prepare the INSERT query  \n",
    "        query = f\"\"\"\n",
    "            LOAD DATA LOCAL INFILE '{file_path}' INTO TABLE {table_name}\n",
    "            FIELDS TERMINATED BY '{sep}'\n",
    "            OPTIONALLY ENCLOSED BY '\"'\n",
    "            LINES TERMINATED BY '\\n'\n",
    "            IGNORE 1 ROWS\n",
    "            ({','.join([f\"`{header}`\" for header in headers])});\n",
    "        \"\"\"\n",
    "        \n",
    "        cursor.execute(query)\n",
    "        # Commit changes and close the cursor and connection\n",
    "        cnx.commit()\n",
    "        print(\"Database loaded\")\n",
    "        cursor.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## BindingDB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "config = dbs_config[\"BindingDB\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "load_csv_to_table(cnx, config)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## ChEMBL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "config = dbs_config[\"ChEMBL\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "load_csv_to_table(cnx, config)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CTD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "config = dbs_config[\"CTD\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "load_csv_to_table(cnx, config)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Drugbank"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "config = dbs_config[\"DrugBank\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "load_csv_to_table(cnx, config)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## DrugCentral"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "config = dbs_config[\"DrugCentral\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "load_csv_to_table(cnx, config)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## DTC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "config = dbs_config[\"DTC\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "load_csv_to_table(cnx, config)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## STITCH"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "config = dbs_config[\"STITCH\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "load_csv_to_table(cnx, config)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "cpi",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
